Przygotowanie środowiska

# Packages required by this analysis (loaded/installed below).
package_Vector <- c("dplyr", "data.table", "ggplot2", "corrplot",
                    "reshape2", "plyr", "plotly", "caret")


# Install any missing package, then attach them all. requireNamespace()
# checks availability without attaching; library() (unlike require())
# errors loudly if attaching still fails after installation.
for (package in package_Vector) {
  if (!requireNamespace(package, quietly = TRUE)) {
    install.packages(package)
  }
  library(package, character.only = TRUE)
}
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## corrplot 0.84 loaded
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Wczytanie i przetworzenie danych

# Download the dataset once; a relative path resolves against getwd(),
# so the paste(getwd(), ...) construction is unnecessary.
if (!file.exists("sledzie.csv")) {
  download.file("http://www.cs.put.poznan.pl/dbrzezinski/teaching/sphd/sledzie.csv",
                "sledzie.csv")
}
# Read the data; "?" marks missing values in the source file.
daneSurowe <- read.csv("sledzie.csv", na.strings = c("?"))


# Per-(totaln, xmonth) means of every numeric column; used below to
# impute missing values with their group mean.
srednieGrupowe <- ddply(
  .data = daneSurowe,
  .variables = c("totaln", "xmonth"),
  .fun = numcolwise(mean, na.rm = TRUE)
)
# Replace missing values in a ddply group `x` with the mean of the
# matching (totaln, xmonth) group taken from the global `srednieGrupowe`.
# Grouping (below) is by the unique id column, so `x` is a single row;
# like the original, a column with any NA is overwritten wholesale with
# the group mean. The original used a scalar ifelse() purely for the
# side-effect assignment in its `no` branch — a plain conditional is
# equivalent and far clearer.
f1 <- function(x) {
  for (icol in seq_len(ncol(x))) {
    if (anyNA(x[, icol])) {
      x[, icol] <- subset(
        srednieGrupowe,
        (srednieGrupowe$totaln == x$totaln) & (srednieGrupowe$xmonth == x$xmonth),
        colnames(x)[icol]
      )
    }
  }
  x
}



# Impute NA values group by group (column 1 is the observation id, so
# each ddply group is a single row).
daneBezNA <- ddply(daneSurowe, 1, f1)

# Flag values further than 3 standard deviations from the column mean
# as NA, then keep only the fully observed rows.
daneBezOutliers <- as.data.frame(
  apply(
    daneBezNA[, c("length", "cfin1", "cfin2", "chel1", "chel2", "lcop1", "lcop2")],
    2,
    function(v) ifelse((v > (mean(v) + 3 * sd(v))) | (v < (mean(v) - 3 * sd(v))), NA, v)
  )
)
daneBezOutliers <- daneBezNA[complete.cases(daneBezOutliers), ]
dane <- daneBezOutliers

Podstawowe statystyki

Poniżej znajdują się podstawowe statystyki opisowe dla zbioru danych

# Descriptive statistics for every analysed attribute.
summary(dane[, c(
  "length", "cfin1", "cfin2", "chel1", "chel2", "lcop1", "lcop2",
  "fbar", "recr", "cumf", "totaln", "sst", "sal", "xmonth", "nao"
)])
##      length         cfin1            cfin2             chel1       
##  Min.   :20.5   Min.   :0.0000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.:24.0   1st Qu.:0.0000   1st Qu.: 0.2500   1st Qu.: 2.267  
##  Median :25.0   Median :0.1111   Median : 0.7012   Median : 5.095  
##  Mean   :25.2   Mean   :0.3929   Mean   : 1.5154   Mean   : 7.263  
##  3rd Qu.:26.5   3rd Qu.:0.3333   3rd Qu.: 1.5805   3rd Qu.:11.500  
##  Max.   :30.0   Max.   :3.1446   Max.   :13.1434   Max.   :36.333  
##      chel2            lcop1             lcop2             fbar       
##  Min.   : 5.238   Min.   : 0.3074   Min.   : 7.849   Min.   :0.0680  
##  1st Qu.:13.427   1st Qu.: 2.5030   1st Qu.:17.808   1st Qu.:0.2010  
##  Median :20.158   Median : 5.9167   Median :24.859   Median :0.3180  
##  Mean   :20.228   Mean   :10.1994   Mean   :26.609   Mean   :0.3169  
##  3rd Qu.:26.812   3rd Qu.:16.3656   3rd Qu.:36.190   3rd Qu.:0.4030  
##  Max.   :39.568   Max.   :41.1667   Max.   :64.861   Max.   :0.8490  
##       recr              cumf             totaln             sst       
##  Min.   : 140515   Min.   :0.06833   Min.   : 144137   Min.   :12.77  
##  1st Qu.: 364794   1st Qu.:0.14809   1st Qu.: 306160   1st Qu.:13.63  
##  Median : 459347   Median :0.22438   Median : 539558   Median :13.87  
##  Mean   : 538305   Mean   :0.22252   Mean   : 521658   Mean   :13.91  
##  3rd Qu.: 724151   3rd Qu.:0.28116   3rd Qu.: 730351   3rd Qu.:14.21  
##  Max.   :1380210   Max.   :0.39801   Max.   :1015595   Max.   :14.73  
##       sal            xmonth           nao         
##  Min.   :35.40   Min.   : 1.00   Min.   :-3.7800  
##  1st Qu.:35.51   1st Qu.: 5.00   1st Qu.:-1.5400  
##  Median :35.51   Median : 8.00   Median : 0.3400  
##  Mean   :35.51   Mean   : 7.22   Mean   : 0.1526  
##  3rd Qu.:35.52   3rd Qu.: 9.00   3rd Qu.: 1.8000  
##  Max.   :35.61   Max.   :12.00   Max.   : 5.0800

Analiza wartości atrybutów

# Long format for faceted boxplots of the numeric attributes.
meltData <- melt(
  dane[, c("length", "cfin1", "cfin2", "chel1", "chel2", "lcop1", "lcop2",
           "cumf", "sst", "sal", "nao")],
  na.rm = TRUE
)
p <- ggplot(meltData, aes(factor(variable), value))
# facet_wrap's argument is `scales`; the original relied on partial
# argument matching with `scale = "free"`.
p + geom_boxplot() + facet_wrap(~variable, scales = "free")

Korelacje pomiędzy zmiennymi

# Pearson correlation matrix of the analysed attributes.
corelation <- cor(dane[, c(
  "length", "cfin1", "cfin2", "chel1", "chel2", "lcop1", "lcop2",
  "fbar", "recr", "cumf", "xmonth", "sst", "nao", "sal"
)])

corrplot(
  corelation,
  col = colorRampPalette(c("blue", "white", "red"))(5),
  title = "Korelacje pomiędzy zmiennymi",
  method = "ellipse",
  outline = TRUE,
  diag = FALSE,
  addgrid.col = "darkgray",
  order = "hclust",
  mar = c(4, 0, 4, 0),
  addrect = 4,
  rect.col = "black",
  rect.lwd = 5,
  cl.pos = "b",
  tl.col = "indianred4"
)

Interaktywny wykres długości śledzia

#plot_ly(dane,x=dane$fbar,y=dane$length, type ="scatter", mode ="markers")

# Bin fbar into quantile-based classes. cut() with right-open intervals
# reproduces the original five-deep nested-ifelse boundaries exactly
# (values below the 0.125 quantile -> "Q.125", ..., values >= the median
# -> "Q.99"; the original "0" fallback label was unreachable) while
# evaluating each quantile only once.
dane$fbarFactor <- cut(
  dane$fbar,
  breaks = c(-Inf, quantile(dane$fbar, c(0.125, 0.25, 0.375, 0.5)), Inf),
  labels = c("Q.125", "Q.250", "Q.375", "Q.500", "Q.99"),
  right = FALSE
)
# Interactive boxplot of herring length per fbar quantile class.
p <- plot_ly(
  dane,
  x = ~fbarFactor,
  y = ~length,
  split = ~fbarFactor,
  type = 'box'
) %>%
  layout(
    title = "Długość śledzia w czasie jego życia",
    xaxis = list(title = "Kwantyl"),
    yaxis = list(title = "Długość śledzia")
  )
p

Regresor przewidujący rozmiar śledzia

#plot_ly(dane,x=dane$fbar,y=dane$length, type ="scatter", mode ="markers")

# Round length to the nearest 0.5 cm: discrete classes used later for
# the confusion matrix.
dane$newlength <- round(dane$length / 0.5) * 0.5

set.seed(123)
inTraining <- createDataPartition(y = dane$length, p = 0.75, list = FALSE)

# 2-fold cross-validation repeated 5 times.
ctrl <- caret::trainControl(method = "repeatedcv", number = 2, repeats = 5)

# Elastic-net regression on all predictors except identifiers and
# length-derived columns. The original call also passed ntree = 10 and
# importance = T, which are randomForest arguments with no effect on
# method = "enet"; they are dropped here.
fit <- train(
  dane[inTraining, -which(names(dane) %in% c("X", "length", "newlength", "totaln", "fbarFactor"))],
  dane[inTraining, which(names(dane) == "length")],
  method = "enet",
  trControl = ctrl
)

fit
## Elasticnet 
## 
## 36377 samples
##    13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (2 fold, repeated 5 times) 
## Summary of sample sizes: 18188, 18189, 18189, 18188, 18188, 18189, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE      Rsquared   MAE     
##   0e+00   0.050     1.568187  0.1893865  1.268536
##   0e+00   0.525     1.390346  0.2858234  1.114684
##   0e+00   1.000     1.365824  0.3055324  1.091980
##   1e-04   0.050     1.568234  0.1893865  1.268574
##   1e-04   0.525     1.390396  0.2857795  1.114740
##   1e-04   1.000     1.365823  0.3055324  1.091991
##   1e-01   0.050     1.589116  0.1893865  1.285737
##   1e-01   0.525     1.420142  0.2569521  1.143091
##   1e-01   1.000     1.381125  0.2908730  1.107573
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 1 and lambda = 1e-04.
# Predict on the hold-out rows and round predictions to 0.5 cm classes.
rfClasses <- predict(
  fit,
  newdata = dane[-inTraining, -which(names(dane) %in% c("X", "length", "newlength", "totaln", "fbarFactor"))],
  type = "raw"
)
rfClasses2 <- round(rfClasses / 0.5) * 0.5

# Use one sorted level set for both factors. The original passed
# levels = unique(...) (insertion order) for the predictions only, which
# triggered the "Levels are not in the same order" warning and forced
# confusionMatrix() to refactor the data; the resulting table is the same.
classLevels <- sort(unique(dane[-inTraining, which(names(dane) == "newlength")]))
confusionMatrix(
  data = factor(rfClasses2, levels = classLevels),
  reference = factor(dane[-inTraining, which(names(dane) == "newlength")], levels = classLevels)
)
## Warning in confusionMatrix.default(data = factor(rfClasses2, levels =
## unique(dane[-inTraining, : Levels are not in the same order for reference
## and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction 20.5  21 21.5  22 22.5  23 23.5  24 24.5  25 25.5  26 26.5  27
##       20.5    0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       21      0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       21.5    0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       22      0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       22.5    0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       23      0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       23.5    7  11   32 104  145 184  172 170  117 103   49  26    9   3
##       24     10  40   40  82  119 127  129 119   74  43   30  17    1   1
##       24.5    0   4   12  35   47 103  119 162  177 172  150 111  101  46
##       25      0   1    1   9   24  67  116 155  215 273  254 245  163 128
##       25.5    1   4   19  33   52 133  214 370  510 504  548 468  377 219
##       26      0   0    0   4    9  29   54 117  200 267  305 377  366 311
##       26.5    0   0    0   0    3   3    4  21   31  19   19  13   25  27
##       27      0   0    0   0    0   2    3   8    5  19   20  23   49  74
##       27.5    0   0    0   0    0   0    1   1    2   2    4   7    8   4
##       28      0   0    0   0    0   1    4   2    6   4    4  10    9  17
##       28.5    0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       29      0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       29.5    0   0    0   0    0   0    0   0    0   0    0   0    0   0
##       30      0   0    0   0    0   0    0   0    0   0    0   0    0   0
##           Reference
## Prediction 27.5  28 28.5  29 29.5  30
##       20.5    0   0    0   0    0   0
##       21      0   0    0   0    0   0
##       21.5    0   0    0   0    0   0
##       22      0   0    0   0    0   0
##       22.5    0   0    0   0    0   0
##       23      0   0    0   0    0   0
##       23.5    0   1    0   0    0   0
##       24      2   0    0   0    0   0
##       24.5   17   6    4   1    0   0
##       25     53  29   12  11    1   2
##       25.5  162  99   66  22    8   3
##       26    264 164   78  21    7   3
##       26.5   20  14    9   2    0   0
##       27     70  54   44  35    6   0
##       27.5    4   7    5   3    0   0
##       28     10   3    2   0    0   0
##       28.5    0   0    0   0    0   0
##       29      0   0    0   0    0   0
##       29.5    0   0    0   0    0   0
##       30      0   0    0   0    0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1462          
##                  95% CI : (0.1399, 0.1526)
##     No Information Rate : 0.116           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.0471          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 20.5 Class: 21 Class: 21.5 Class: 22
## Sensitivity             0.000000  0.000000    0.000000   0.00000
## Specificity             1.000000  1.000000    1.000000   1.00000
## Pos Pred Value               NaN       NaN         NaN       NaN
## Neg Pred Value          0.998515  0.995051    0.991421   0.97798
## Prevalence              0.001485  0.004949    0.008579   0.02202
## Detection Rate          0.000000  0.000000    0.000000   0.00000
## Detection Prevalence    0.000000  0.000000    0.000000   0.00000
## Balanced Accuracy       0.500000  0.500000    0.500000   0.50000
##                      Class: 22.5 Class: 23 Class: 23.5 Class: 24
## Sensitivity              0.00000   0.00000     0.21078  0.105778
## Specificity              1.00000   1.00000     0.91501  0.934988
## Pos Pred Value               NaN       NaN     0.15181  0.142686
## Neg Pred Value           0.96709   0.94647     0.94140  0.910887
## Prevalence               0.03291   0.05353     0.06731  0.092799
## Detection Rate           0.00000   0.00000     0.01419  0.009816
## Detection Prevalence     0.00000   0.00000     0.09346  0.068795
## Balanced Accuracy        0.50000   0.50000     0.56290  0.520383
##                      Class: 24.5 Class: 25 Class: 25.5 Class: 26
## Sensitivity               0.1324   0.19417      0.3962    0.2907
## Specificity               0.8989   0.86134      0.6961    0.7969
## Pos Pred Value            0.1397   0.15520      0.1438    0.1464
## Neg Pred Value            0.8931   0.89068      0.8995    0.9036
## Prevalence                0.1103   0.11598      0.1141    0.1070
## Detection Rate            0.0146   0.02252      0.0452    0.0311
## Detection Prevalence      0.1045   0.14510      0.3144    0.2125
## Balanced Accuracy         0.5157   0.52775      0.5462    0.5438
##                      Class: 26.5 Class: 27 Class: 27.5 Class: 28
## Sensitivity             0.022563  0.089157    0.006645 0.0079576
## Specificity             0.983205  0.970070    0.996181 0.9941257
## Pos Pred Value          0.119048  0.179612    0.083333 0.0416667
## Neg Pred Value          0.909091  0.935445    0.950476 0.9689652
## Prevalence              0.091397  0.068465    0.049658 0.0310979
## Detection Rate          0.002062  0.006104    0.000330 0.0002475
## Detection Prevalence    0.017322  0.033985    0.003959 0.0059391
## Balanced Accuracy       0.502884  0.529613    0.501413 0.5010416
##                      Class: 28.5 Class: 29 Class: 29.5 Class: 30
## Sensitivity              0.00000  0.000000    0.000000 0.0000000
## Specificity              1.00000  1.000000    1.000000 1.0000000
## Pos Pred Value               NaN       NaN         NaN       NaN
## Neg Pred Value           0.98185  0.992164    0.998185 0.9993401
## Prevalence               0.01815  0.007836    0.001815 0.0006599
## Detection Rate           0.00000  0.000000    0.000000 0.0000000
## Detection Prevalence     0.00000  0.000000    0.000000 0.0000000
## Balanced Accuracy        0.50000  0.500000    0.500000 0.5000000

Analiza ważności zmiennych objaśniających

# Variable importance for the fitted model; `scale` is a logical flag,
# not the string "TRUE" (the string only works via implicit coercion).
varImp(fit, scale = TRUE)
## loess r-squared variable importance
## 
##        Overall
## sst    100.000
## fbar    22.516
## nao     18.869
## lcop1   11.800
## chel1    6.967
## chel2    5.473
## sal      5.056
## cfin1    2.791
## cumf     1.412
## lcop2    1.401
## recr     1.100
## cfin2    0.389
## xmonth   0.000